import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
df_train = pd.read_csv('../data/for_modelling/credit_score_train.csv')
df_val = pd.read_csv('../data/for_modelling/credit_score_valid.csv')
df=pd.concat([df_train, df_val])
#df=df_train.copy()
df.head()
| INCOME | SAVINGS | DEBT | R_SAVINGS_INCOME | R_DEBT_INCOME | R_DEBT_SAVINGS | T_CLOTHING_12 | T_CLOTHING_6 | R_CLOTHING | R_CLOTHING_INCOME | ... | R_EXPENDITURE_SAVINGS | R_EXPENDITURE_DEBT | CAT_GAMBLING | CAT_DEBT | CAT_CREDIT_CARD | CAT_MORTGAGE | CAT_SAVINGS_ACCOUNT | CAT_DEPENDENTS | CREDIT_SCORE | DEFAULT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2783 | 1855 | 0 | 0.6665 | 0.00 | 0.0000 | 103 | 74 | 0.7184 | 0.0370 | ... | 2.5003 | 0.0000 | No | 0 | 0 | 0 | 1 | 0 | 570 | 0 |
| 1 | 314430 | 445442 | 707468 | 1.4167 | 2.25 | 1.5882 | 35861 | 29157 | 0.8131 | 0.1141 | ... | 0.5882 | 0.3704 | High | 1 | 0 | 1 | 1 | 0 | 691 | 0 |
| 2 | 161773 | 517674 | 2782496 | 3.2000 | 17.20 | 5.3750 | 3716 | 2533 | 0.6816 | 0.0230 | ... | 0.3125 | 0.0581 | No | 1 | 1 | 1 | 1 | 1 | 520 | 0 |
| 3 | 16014 | 97685 | 20818 | 6.1000 | 1.30 | 0.2131 | 637 | 187 | 0.2936 | 0.0398 | ... | 0.1639 | 0.7692 | No | 1 | 0 | 0 | 1 | 0 | 654 | 0 |
| 4 | 193225 | 1410542 | 2589215 | 7.3000 | 13.40 | 1.8356 | 5276 | 2325 | 0.4407 | 0.0273 | ... | 0.1370 | 0.0746 | No | 1 | 1 | 0 | 1 | 1 | 552 | 0 |
5 rows × 86 columns
# let's set our target and look into it
target='DEFAULT'
df[target].value_counts()
DEFAULT 0 450 1 178 Name: count, dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 628 entries, 0 to 137 Data columns (total 86 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 INCOME 628 non-null int64 1 SAVINGS 628 non-null int64 2 DEBT 628 non-null int64 3 R_SAVINGS_INCOME 628 non-null float64 4 R_DEBT_INCOME 628 non-null float64 5 R_DEBT_SAVINGS 628 non-null float64 6 T_CLOTHING_12 628 non-null int64 7 T_CLOTHING_6 628 non-null int64 8 R_CLOTHING 628 non-null float64 9 R_CLOTHING_INCOME 628 non-null float64 10 R_CLOTHING_SAVINGS 628 non-null float64 11 R_CLOTHING_DEBT 628 non-null float64 12 T_EDUCATION_12 628 non-null int64 13 T_EDUCATION_6 628 non-null int64 14 R_EDUCATION 628 non-null float64 15 R_EDUCATION_INCOME 628 non-null float64 16 R_EDUCATION_SAVINGS 628 non-null float64 17 R_EDUCATION_DEBT 628 non-null float64 18 T_ENTERTAINMENT_12 628 non-null int64 19 T_ENTERTAINMENT_6 628 non-null int64 20 R_ENTERTAINMENT 628 non-null float64 21 R_ENTERTAINMENT_INCOME 628 non-null float64 22 R_ENTERTAINMENT_SAVINGS 628 non-null float64 23 R_ENTERTAINMENT_DEBT 628 non-null float64 24 T_FINES_12 628 non-null int64 25 T_FINES_6 628 non-null int64 26 R_FINES 628 non-null float64 27 R_FINES_INCOME 628 non-null float64 28 R_FINES_SAVINGS 628 non-null float64 29 R_FINES_DEBT 628 non-null float64 30 T_GAMBLING_12 628 non-null int64 31 T_GAMBLING_6 628 non-null int64 32 R_GAMBLING 628 non-null float64 33 R_GAMBLING_INCOME 628 non-null float64 34 R_GAMBLING_SAVINGS 628 non-null float64 35 R_GAMBLING_DEBT 628 non-null float64 36 T_GROCERIES_12 628 non-null int64 37 T_GROCERIES_6 628 non-null int64 38 R_GROCERIES 628 non-null float64 39 R_GROCERIES_INCOME 628 non-null float64 40 R_GROCERIES_SAVINGS 628 non-null float64 41 R_GROCERIES_DEBT 628 non-null float64 42 T_HEALTH_12 628 non-null int64 43 T_HEALTH_6 628 non-null int64 44 R_HEALTH 628 non-null float64 45 R_HEALTH_INCOME 628 non-null float64 46 R_HEALTH_SAVINGS 628 non-null float64 47 R_HEALTH_DEBT 628 non-null float64 48 T_HOUSING_12 628 non-null int64 49 T_HOUSING_6 628 non-null int64 50 R_HOUSING 628 non-null float64 51 R_HOUSING_INCOME 628 non-null float64 52 R_HOUSING_SAVINGS 628 non-null float64 53 R_HOUSING_DEBT 628 non-null float64 54 T_TAX_12 628 non-null int64 55 T_TAX_6 628 non-null int64 56 R_TAX 628 non-null float64 57 R_TAX_INCOME 628 non-null float64 58 R_TAX_SAVINGS 628 non-null float64 59 R_TAX_DEBT 628 non-null float64 60 T_TRAVEL_12 628 non-null int64 61 T_TRAVEL_6 628 non-null int64 62 R_TRAVEL 628 non-null float64 63 R_TRAVEL_INCOME 628 non-null float64 64 R_TRAVEL_SAVINGS 628 non-null float64 65 R_TRAVEL_DEBT 628 non-null float64 66 T_UTILITIES_12 628 non-null int64 67 T_UTILITIES_6 628 non-null int64 68 R_UTILITIES 628 non-null float64 69 R_UTILITIES_INCOME 628 non-null float64 70 R_UTILITIES_SAVINGS 628 non-null float64 71 R_UTILITIES_DEBT 628 non-null float64 72 T_EXPENDITURE_12 628 non-null int64 73 T_EXPENDITURE_6 628 non-null int64 74 R_EXPENDITURE 628 non-null float64 75 R_EXPENDITURE_INCOME 628 non-null float64 76 R_EXPENDITURE_SAVINGS 628 non-null float64 77 R_EXPENDITURE_DEBT 628 non-null float64 78 CAT_GAMBLING 628 non-null object 79 CAT_DEBT 628 non-null int64 80 CAT_CREDIT_CARD 628 non-null int64 81 CAT_MORTGAGE 628 non-null int64 82 CAT_SAVINGS_ACCOUNT 628 non-null int64 83 CAT_DEPENDENTS 628 non-null int64 84 CREDIT_SCORE 628 non-null int64 85 DEFAULT 628 non-null int64 dtypes: float64(51), int64(34), object(1) memory usage: 426.8+ KB
There are no missing values in our dataset :))
In our dataset there is only one categorical variable - 'CAT_GAMBLING'.
df['CAT_GAMBLING'].value_counts()
CAT_GAMBLING No 392 High 163 Low 73 Name: count, dtype: int64
We will use ordinal encoding for this variable
df['CAT_GAMBLING'] = df['CAT_GAMBLING'].map({'No': 0, 'Low': 1,'High': 2})
df
| INCOME | SAVINGS | DEBT | R_SAVINGS_INCOME | R_DEBT_INCOME | R_DEBT_SAVINGS | T_CLOTHING_12 | T_CLOTHING_6 | R_CLOTHING | R_CLOTHING_INCOME | ... | R_EXPENDITURE_SAVINGS | R_EXPENDITURE_DEBT | CAT_GAMBLING | CAT_DEBT | CAT_CREDIT_CARD | CAT_MORTGAGE | CAT_SAVINGS_ACCOUNT | CAT_DEPENDENTS | CREDIT_SCORE | DEFAULT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2783 | 1855 | 0 | 0.6665 | 0.0000 | 0.0000 | 103 | 74 | 0.7184 | 0.0370 | ... | 2.5003 | 0.0000 | 0 | 0 | 0 | 0 | 1 | 0 | 570 | 0 |
| 1 | 314430 | 445442 | 707468 | 1.4167 | 2.2500 | 1.5882 | 35861 | 29157 | 0.8131 | 0.1141 | ... | 0.5882 | 0.3704 | 2 | 1 | 0 | 1 | 1 | 0 | 691 | 0 |
| 2 | 161773 | 517674 | 2782496 | 3.2000 | 17.2000 | 5.3750 | 3716 | 2533 | 0.6816 | 0.0230 | ... | 0.3125 | 0.0581 | 0 | 1 | 1 | 1 | 1 | 1 | 520 | 0 |
| 3 | 16014 | 97685 | 20818 | 6.1000 | 1.3000 | 0.2131 | 637 | 187 | 0.2936 | 0.0398 | ... | 0.1639 | 0.7692 | 0 | 1 | 0 | 0 | 1 | 0 | 654 | 0 |
| 4 | 193225 | 1410542 | 2589215 | 7.3000 | 13.4000 | 1.8356 | 5276 | 2325 | 0.4407 | 0.0273 | ... | 0.1370 | 0.0746 | 0 | 1 | 1 | 0 | 1 | 1 | 552 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 133 | 386976 | 1250231 | 654883 | 3.2308 | 1.6923 | 0.5238 | 36665 | 30569 | 0.8337 | 0.0947 | ... | 0.2381 | 0.4545 | 0 | 1 | 0 | 0 | 1 | 0 | 562 | 0 |
| 134 | 16772 | 8386 | 0 | 0.5000 | 0.0000 | 0.0000 | 238 | 186 | 0.7815 | 0.0142 | ... | 2.5000 | 0.9786 | 0 | 0 | 0 | 0 | 1 | 0 | 619 | 0 |
| 135 | 137509 | 206264 | 206264 | 1.5000 | 1.5000 | 1.0000 | 15661 | 10865 | 0.6938 | 0.1139 | ... | 0.6667 | 0.6667 | 0 | 1 | 0 | 0 | 1 | 0 | 639 | 1 |
| 136 | 259146 | 1922000 | 1662854 | 7.4167 | 6.4167 | 0.8652 | 12180 | 8527 | 0.7001 | 0.0470 | ... | 0.1124 | 0.1299 | 2 | 1 | 1 | 0 | 1 | 1 | 587 | 1 |
| 137 | 56657 | 442952 | 51506 | 7.8181 | 0.9091 | 0.1163 | 1085 | 289 | 0.2664 | 0.0192 | ... | 0.1163 | 1.0000 | 0 | 1 | 0 | 0 | 1 | 0 | 635 | 1 |
628 rows × 86 columns
#let's prepare dataset without 'DEFAULT', binary variables and 'Cat_gambling' because it was categorical
#we will use it often in the next steps
binary_cols = [col for col in df.columns if df[col].nunique() == 2]
rest_cols = [col for col in df.columns if col not in binary_cols]
rest_cols.remove('CAT_GAMBLING')
# for col in rest_cols:
# sns.boxplot(x=df[col])
# plt.show()
There are too many columns in our dataset so boxplots won't be legible. We will try to detect outliers using other methods.
from pyod.models.knn import KNN
clf = KNN(contamination=0.04)
clf.fit(df[rest_cols])
df['outliers'] = clf.labels_
df['outliers'].value_counts()
outliers 0 602 1 26 Name: count, dtype: int64
This is automatic detection of outliers. We assume that 4% of our data is as outliers. (we checked different values and 4% seems to be the best) Let's check if we can find some outliers manually.
# we are showing scatterplots for all features except 'DEFAULT', binary variables and 'Cat_gambling' because it was categorical
df_without_binary = [col for col in df.columns if df[col].nunique() > 2]
columns_to_scatter_plot = [col for col in df_without_binary if col not in ['CREDIT_SCORE', 'CAT_GAMBLING', 'DEFAULT']]
fig, axs = plt.subplots(26, 3, figsize=(16, 100))
axs = axs.flatten()
for i, col in enumerate(columns_to_scatter_plot):
sns.scatterplot(data=df, x=col, y="CREDIT_SCORE", ax=axs[i], s=15, hue='DEFAULT')
plt.tight_layout()
plt.show()
Outliers:
Let's remove these outliers
# Definiowanie granic dla outlierów
outliers_dict = {
'T_CLOTHING_6': 37000,
'R_CLOTHING_SAVINGS': 1.50,
'R_CLOTHING_DEBT': 1.0,
'R_EDUCATION_SAVINGS': 1.7,
'R_EDUCATION_DEBT': 0.3,
'R_ENTERTAINMENT_INCOME': 1.2,
'R_ENTERTAINMENT_SAVINGS': 6,
'R_ENTERTAINMENT_DEBT': 2.0,
'R_FINES_INCOME': 0.02,
'R_FINES_SAVINGS': 0.05,
'R_FINES_DEBT': 0.02,
'R_GAMBLING_INCOME': 0.2,
'R_GAMBLING_SAVINGS': 0.8,
'R_GAMBLING_DEBT': 0.15,
'R_GROCERIES_SAVINGS': 3.5,
'T_HEALTH_12': 40000,
'T_HEALTH_6': 25000,
'R_HEALTH_INCOME': 0.3,
'R_HEALTH_SAVINGS': 0.8,
'R_HOUSING_DEBT': 3,
'R_TAX_DEBT': 0.15
}
for col, threshold in outliers_dict.items():
df_without_outliers = df[df[col] <= threshold]
df_without_outliers.reset_index(drop=True, inplace=True)
df_without_outliers
| INCOME | SAVINGS | DEBT | R_SAVINGS_INCOME | R_DEBT_INCOME | R_DEBT_SAVINGS | T_CLOTHING_12 | T_CLOTHING_6 | R_CLOTHING | R_CLOTHING_INCOME | ... | R_EXPENDITURE_DEBT | CAT_GAMBLING | CAT_DEBT | CAT_CREDIT_CARD | CAT_MORTGAGE | CAT_SAVINGS_ACCOUNT | CAT_DEPENDENTS | CREDIT_SCORE | DEFAULT | outliers | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2783 | 1855 | 0 | 0.6665 | 0.0000 | 0.0000 | 103 | 74 | 0.7184 | 0.0370 | ... | 0.0000 | 0 | 0 | 0 | 0 | 1 | 0 | 570 | 0 | 0 |
| 1 | 314430 | 445442 | 707468 | 1.4167 | 2.2500 | 1.5882 | 35861 | 29157 | 0.8131 | 0.1141 | ... | 0.3704 | 2 | 1 | 0 | 1 | 1 | 0 | 691 | 0 | 0 |
| 2 | 161773 | 517674 | 2782496 | 3.2000 | 17.2000 | 5.3750 | 3716 | 2533 | 0.6816 | 0.0230 | ... | 0.0581 | 0 | 1 | 1 | 1 | 1 | 1 | 520 | 0 | 0 |
| 3 | 16014 | 97685 | 20818 | 6.1000 | 1.3000 | 0.2131 | 637 | 187 | 0.2936 | 0.0398 | ... | 0.7692 | 0 | 1 | 0 | 0 | 1 | 0 | 654 | 0 | 0 |
| 4 | 193225 | 1410542 | 2589215 | 7.3000 | 13.4000 | 1.8356 | 5276 | 2325 | 0.4407 | 0.0273 | ... | 0.0746 | 0 | 1 | 1 | 0 | 1 | 1 | 552 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 621 | 386976 | 1250231 | 654883 | 3.2308 | 1.6923 | 0.5238 | 36665 | 30569 | 0.8337 | 0.0947 | ... | 0.4545 | 0 | 1 | 0 | 0 | 1 | 0 | 562 | 0 | 0 |
| 622 | 16772 | 8386 | 0 | 0.5000 | 0.0000 | 0.0000 | 238 | 186 | 0.7815 | 0.0142 | ... | 0.9786 | 0 | 0 | 0 | 0 | 1 | 0 | 619 | 0 | 0 |
| 623 | 137509 | 206264 | 206264 | 1.5000 | 1.5000 | 1.0000 | 15661 | 10865 | 0.6938 | 0.1139 | ... | 0.6667 | 0 | 1 | 0 | 0 | 1 | 0 | 639 | 1 | 0 |
| 624 | 259146 | 1922000 | 1662854 | 7.4167 | 6.4167 | 0.8652 | 12180 | 8527 | 0.7001 | 0.0470 | ... | 0.1299 | 2 | 1 | 1 | 0 | 1 | 1 | 587 | 1 | 1 |
| 625 | 56657 | 442952 | 51506 | 7.8181 | 0.9091 | 0.1163 | 1085 | 289 | 0.2664 | 0.0192 | ... | 1.0000 | 0 | 1 | 0 | 0 | 1 | 0 | 635 | 1 | 0 |
626 rows × 87 columns
# histograms for our dataset
df[rest_cols].hist(bins=40, figsize=(20, 20))
plt.tight_layout()
plt.show()
Data transformation -Box Cox, standard
# use Box Cox transformation for rest_cols
from scipy.stats import boxcox
df_box_cox=df.copy()
for col in rest_cols:
df_box_cox[col] = boxcox(df_box_cox[col]+1)[0]
df_box_cox.head()
| INCOME | SAVINGS | DEBT | R_SAVINGS_INCOME | R_DEBT_INCOME | R_DEBT_SAVINGS | T_CLOTHING_12 | T_CLOTHING_6 | R_CLOTHING | R_CLOTHING_INCOME | ... | R_EXPENDITURE_DEBT | CAT_GAMBLING | CAT_DEBT | CAT_CREDIT_CARD | CAT_MORTGAGE | CAT_SAVINGS_ACCOUNT | CAT_DEPENDENTS | CREDIT_SCORE | DEFAULT | outliers | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 43.459601 | 28.600123 | 0.000000 | 0.495605 | 0.000000 | 0.000000 | 9.924372 | 7.003166 | 0.690181 | 0.027768 | ... | 0.000000 | 0 | 0 | 0 | 0 | 1 | 0 | 8.383897e+07 | 0 | 0 |
| 1 | 241.416057 | 162.327132 | 94.552153 | 0.837915 | 1.265938 | 0.793731 | 70.869805 | 36.115755 | 0.777806 | 0.052375 | ... | 0.217763 | 2 | 1 | 0 | 1 | 1 | 0 | 1.506710e+08 | 0 | 0 |
| 2 | 190.490817 | 169.976872 | 131.672996 | 1.319896 | 3.469615 | 1.316598 | 34.737136 | 19.786169 | 0.655954 | 0.019159 | ... | 0.052655 | 0 | 1 | 1 | 1 | 1 | 1 | 6.339584e+07 | 0 | 0 |
| 3 | 82.849139 | 101.686796 | 39.199909 | 1.749466 | 0.875886 | 0.186041 | 19.314106 | 9.495216 | 0.288280 | 0.029270 | ... | 0.303274 | 0 | 1 | 0 | 0 | 1 | 0 | 1.274213e+08 | 0 | 0 |
| 4 | 202.960902 | 230.871110 | 129.412824 | 1.872176 | 3.142675 | 0.855484 | 38.876541 | 19.351044 | 0.429239 | 0.022012 | ... | 0.065827 | 0 | 1 | 1 | 0 | 1 | 1 | 7.603568e+07 | 0 | 0 |
5 rows × 87 columns
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_box_cox[rest_cols] = scaler.fit_transform(df_box_cox[rest_cols])
# histogram of rest_cols
df_box_cox[rest_cols].hist(bins=40, figsize=(20, 20))
plt.tight_layout()
plt.show()
box cox transformation without outliers
# use Box Cox transformation for rest_cols
from scipy.stats import boxcox
df_box_cox_woo=df_without_outliers.copy()
for col in rest_cols:
df_box_cox_woo[col] = boxcox(df_box_cox_woo[col]+1)[0]
df_box_cox_woo.head()
| INCOME | SAVINGS | DEBT | R_SAVINGS_INCOME | R_DEBT_INCOME | R_DEBT_SAVINGS | T_CLOTHING_12 | T_CLOTHING_6 | R_CLOTHING | R_CLOTHING_INCOME | ... | R_EXPENDITURE_DEBT | CAT_GAMBLING | CAT_DEBT | CAT_CREDIT_CARD | CAT_MORTGAGE | CAT_SAVINGS_ACCOUNT | CAT_DEPENDENTS | CREDIT_SCORE | DEFAULT | outliers | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 43.539514 | 28.811891 | 0.000000 | 0.496158 | 0.000000 | 0.000000 | 9.911315 | 7.005131 | 0.693785 | 0.027736 | ... | 0.000000 | 0 | 0 | 0 | 0 | 1 | 0 | 8.252999e+07 | 0 | 0 |
| 1 | 242.205425 | 164.691595 | 95.550238 | 0.839518 | 1.266955 | 0.794490 | 70.620484 | 36.143759 | 0.782303 | 0.052229 | ... | 0.216475 | 2 | 1 | 0 | 1 | 1 | 0 | 1.482440e+08 | 0 | 0 |
| 2 | 191.073948 | 172.488174 | 133.240307 | 1.323959 | 3.476710 | 1.318901 | 34.646531 | 19.797235 | 0.659232 | 0.019144 | ... | 0.052592 | 0 | 1 | 1 | 1 | 1 | 1 | 6.242099e+07 | 0 | 0 |
| 3 | 83.043991 | 102.956985 | 39.484341 | 1.756750 | 0.876379 | 0.186079 | 19.276524 | 9.498531 | 0.288968 | 0.029234 | ... | 0.300408 | 0 | 1 | 0 | 0 | 1 | 0 | 1.253868e+08 | 0 | 0 |
| 4 | 203.593486 | 234.605521 | 130.943969 | 1.880567 | 3.148556 | 0.856376 | 38.769820 | 19.361727 | 0.430714 | 0.021992 | ... | 0.065727 | 0 | 1 | 1 | 0 | 1 | 1 | 7.485480e+07 | 0 | 0 |
5 rows × 87 columns
# do standard scaler for rest_cols
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_box_cox_woo[rest_cols] = scaler.fit_transform(df_box_cox_woo[rest_cols])
# histogram of rest_cols
df_box_cox_woo[rest_cols].hist(bins=40, figsize=(20, 20))
plt.tight_layout()
plt.show()
# scores for every method dataframe - cross validation
scores = pd.DataFrame(columns=['method', 'accuracy_type', 'accuracy_score', 'variance'])
We will try different modells on our dfs (after/before deleting outliers and using box cox) to check if our data preprocessing gives us better or worse results.
target="DEFAULT"
df_train=pd.read_csv('../data/for_modelling/credit_score_train.csv')
df_val=pd.read_csv('../data/for_modelling/credit_score_valid.csv')
# transform both train and valid datasets (BOX COX and Standard Scaler)
df_train2=df_train.copy()
df_val2=df_val.copy()
for col in rest_cols:
df_train2[col] = boxcox(df_train2[col]+1)[0]
df_val2[col] = boxcox(df_val2[col]+1)[0]
df_train2[rest_cols] = scaler.fit_transform(df_train2[rest_cols])
df_val2[rest_cols] = scaler.transform(df_val2[rest_cols])
# map CAT_GAMBLING (No-0, Low-1, High-2)
df_train2['CAT_GAMBLING'] = df_train2['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
df_val2['CAT_GAMBLING'] = df_val2['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
X=df_train2.drop(target, axis=1)
y=df_train2[target]
Dummy Classifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
# precision_score, recall_score, f1_score
from sklearn.metrics import precision_score, recall_score, f1_score
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X, y)
y_pred = dummy_clf.predict(df_val2.drop(target, axis=1))
acc=accuracy_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_most_frequent', 'accuracy_type':'accuracy', 'accuracy_score':acc, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
prec=precision_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_most_frequent', 'accuracy_type':'precision', 'accuracy_score':prec, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
rec=recall_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_most_frequent', 'accuracy_type':'recall', 'accuracy_score':rec, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
f1=f1_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_most_frequent', 'accuracy_type':'f1', 'accuracy_score':f1, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
dummy_clf = DummyClassifier(strategy="constant", constant=1)
dummy_clf.fit(X, y)
y_pred = dummy_clf.predict(df_val2.drop(target, axis=1))
acc=accuracy_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_constant_1', 'accuracy_type':'accuracy', 'accuracy_score':acc, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
prec=precision_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_constant_1', 'accuracy_type':'precision', 'accuracy_score':prec, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
rec=recall_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_constant_1', 'accuracy_type':'recall', 'accuracy_score':rec, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
f1=f1_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_constant_1', 'accuracy_type':'f1', 'accuracy_score':f1, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
scores
/opt/homebrew/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
| method | accuracy_type | accuracy_score | variance | |
|---|---|---|---|---|
| 0 | Dummy_most_frequent | accuracy | 0.717391 | 0 |
| 0 | Dummy_most_frequent | precision | 0.000000 | 0 |
| 0 | Dummy_most_frequent | recall | 0.000000 | 0 |
| 0 | Dummy_most_frequent | f1 | 0.000000 | 0 |
| 0 | Dummy_constant_1 | accuracy | 0.282609 | 0 |
| 0 | Dummy_constant_1 | precision | 0.282609 | 0 |
| 0 | Dummy_constant_1 | recall | 1.000000 | 0 |
| 0 | Dummy_constant_1 | f1 | 0.440678 | 0 |
SVC model
# SVC model
from sklearn.svm import SVC # another kernels than rbf and poly with 10 degrees are useless
# import classification report
# igonre wornings
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
model = SVC(random_state=42, kernel='poly')
model.fit(X, y)
y_pred = model.predict(df_val2.drop(target, axis=1))
report=classification_report(df_val2[target], y_pred)
print(report)
precision recall f1-score support
0 0.00 0.00 0.00 99
1 0.28 1.00 0.44 39
accuracy 0.28 138
macro avg 0.14 0.50 0.22 138
weighted avg 0.08 0.28 0.12 138
model = SVC(random_state=42, kernel='linear')
model.fit(X, y)
y_pred = model.predict(df_val2.drop(target, axis=1))
report=classification_report(df_val2[target], y_pred)
print(report)
precision recall f1-score support
0 0.00 0.00 0.00 99
1 0.28 1.00 0.44 39
accuracy 0.28 138
macro avg 0.14 0.50 0.22 138
weighted avg 0.08 0.28 0.12 138
model= SVC(gamma=2, C=1, random_state=42)
model.fit(X, y)
y_pred = model.predict(df_val2.drop(target, axis=1))
report=classification_report(df_val2[target], y_pred)
print(report)
precision recall f1-score support
0 0.72 1.00 0.84 99
1 0.00 0.00 0.00 39
accuracy 0.72 138
macro avg 0.36 0.50 0.42 138
weighted avg 0.51 0.72 0.60 138
model_rbf = SVC(random_state=42, kernel='rbf')
model_rbf.fit(X, y)
y_pred = model_rbf.predict(df_val2.drop(target, axis=1))
report=classification_report(df_val2[target], y_pred)
print(report)
precision recall f1-score support
0 0.72 1.00 0.84 99
1 0.00 0.00 0.00 39
accuracy 0.72 138
macro avg 0.36 0.50 0.42 138
weighted avg 0.51 0.72 0.60 138
model_poly = SVC(random_state=42, kernel='poly', degree=10)
model_poly.fit(X, y)
y_pred = model_poly.predict(df_val2.drop(target, axis=1))
report=classification_report(df_val2[target], y_pred)
print(report)
precision recall f1-score support
0 0.72 1.00 0.84 99
1 0.00 0.00 0.00 39
accuracy 0.72 138
macro avg 0.36 0.50 0.42 138
weighted avg 0.51 0.72 0.60 138
from sklearn.model_selection import cross_val_score
df_cross_val = pd.concat([df_train2, df_val2])
X=df_cross_val.drop(target, axis=1)
y=df_cross_val[target]
rbf_scores=cross_val_score(model_rbf, X, y, cv=10, scoring='accuracy')
df_row=pd.DataFrame({'method':'SVC_rbf_box_cox_with_outliers', 'accuracy_type':'accuracy', 'accuracy_score':rbf_scores.mean(), 'variance':rbf_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
rbf_scores=cross_val_score(model_rbf, X, y, cv=10, scoring="precision")
df_row=pd.DataFrame({'method':'SVC_rbf_box_cox_with_outliers', 'accuracy_type':'precision', 'accuracy_score':rbf_scores.mean(), 'variance':rbf_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
rbf_scores=cross_val_score(model_rbf, X, y, cv=10, scoring="recall")
df_row=pd.DataFrame({'method':'SVC_rbf_box_cox_with_outliers', 'accuracy_type':'recall', 'accuracy_score':rbf_scores.mean(), 'variance':rbf_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
rbf_scores=cross_val_score(model_rbf, X, y, cv=10, scoring="f1")
df_row=pd.DataFrame({'method':'SVC_rbf_box_cox_with_outliers', 'accuracy_type':'f1', 'accuracy_score':rbf_scores.mean(), 'variance':rbf_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
scores
| method | accuracy_type | accuracy_score | variance | |
|---|---|---|---|---|
| 0 | Dummy_most_frequent | accuracy | 0.717391 | 0 |
| 0 | Dummy_most_frequent | precision | 0.000000 | 0 |
| 0 | Dummy_most_frequent | recall | 0.000000 | 0 |
| 0 | Dummy_most_frequent | f1 | 0.000000 | 0 |
| 0 | Dummy_constant_1 | accuracy | 0.282609 | 0 |
| 0 | Dummy_constant_1 | precision | 0.282609 | 0 |
| 0 | Dummy_constant_1 | recall | 1.000000 | 0 |
| 0 | Dummy_constant_1 | f1 | 0.440678 | 0 |
| 0 | SVC_rbf_box_cox_with_outliers | accuracy | 0.716590 | 0.000021 |
| 0 | SVC_rbf_box_cox_with_outliers | precision | 0.000000 | 0.0 |
| 0 | SVC_rbf_box_cox_with_outliers | recall | 0.000000 | 0.0 |
| 0 | SVC_rbf_box_cox_with_outliers | f1 | 0.000000 | 0.0 |
Random Forest model
# Random Forest model
from sklearn.ensemble import RandomForestClassifier
model_random_forest = RandomForestClassifier(random_state=42)
model_random_forest.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
random_forest_scores=cross_val_score(model_random_forest, X, y, cv=10, scoring=type)
df_row=pd.DataFrame({'method':'Random_Forest_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':random_forest_scores.mean(), 'variance':random_forest_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
# Decision Tree model
from sklearn.tree import DecisionTreeClassifier
model_decision_tree = DecisionTreeClassifier(random_state=42, max_depth=5)
model_decision_tree.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
decision_tree_scores=cross_val_score(model_decision_tree, X, y, cv=10, scoring=type)
df_row=pd.DataFrame({'method':'Decision_Tree_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':decision_tree_scores.mean(), 'variance':decision_tree_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
# ADA BOOST model
from sklearn.ensemble import AdaBoostClassifier
model_ada_boost = AdaBoostClassifier(random_state=42, algorithm='SAMME')
model_ada_boost.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
ada_boost_scores=cross_val_score(model_ada_boost, X, y, cv=10, scoring=type)
df_row=pd.DataFrame({'method':'Ada_Boost_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':ada_boost_scores.mean(), 'variance':ada_boost_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
# Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
model_gaussian_nb = GaussianNB()
model_gaussian_nb.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
gaussian_nb_scores=cross_val_score(model_gaussian_nb, X, y, cv=10, scoring=type)
df_row=pd.DataFrame({'method':'Gaussian_NB_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':gaussian_nb_scores.mean(), 'variance':gaussian_nb_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
# Neural Network model
from sklearn.neural_network import MLPClassifier
model_neural_network = MLPClassifier(random_state=42, max_iter=1000)
model_neural_network.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
neural_network_scores=cross_val_score(model_neural_network, X, y, cv=10, scoring=type)
df_row=pd.DataFrame({'method':'Neural_Network_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':neural_network_scores.mean(), 'variance':neural_network_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
# logistic regression model
from sklearn.linear_model import LogisticRegression
model_logistic_regression = LogisticRegression(random_state=42)
model_logistic_regression.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
logistic_regression_scores=cross_val_score(model_logistic_regression, X, y, cv=10, scoring=type)
df_row=pd.DataFrame({'method':'Logistic_Regression_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':logistic_regression_scores.mean(), 'variance':logistic_regression_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
# Quadratic Discriminant Analysis model
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
model_qda = QuadraticDiscriminantAnalysis()
model_qda.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
qda_scores=cross_val_score(model_qda, X, y, cv=10, scoring=type)
df_row=pd.DataFrame({'method':'QDA_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':qda_scores.mean(), 'variance':qda_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
# Stochastic Gradient Descent model
from sklearn.linear_model import SGDClassifier
model_sgd = SGDClassifier(random_state=42)
model_sgd.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
sgd_scores=cross_val_score(model_sgd, X, y, cv=10, scoring=type)
df_row=pd.DataFrame({'method':'SGD_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':sgd_scores.mean(), 'variance':sgd_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
scores.sort_values(by='accuracy_score', ascending=False)
| method | accuracy_type | accuracy_score | variance | |
|---|---|---|---|---|
| 0 | Dummy_constant_1 | recall | 1.000000 | 0 |
| 0 | Ada_Boost_box_cox_with_outliers | accuracy | 0.722785 | 0.001157 |
| 0 | Random_Forest_box_cox_with_outliers | accuracy | 0.719611 | 0.001203 |
| 0 | Dummy_most_frequent | accuracy | 0.717391 | 0 |
| 0 | SVC_rbf_box_cox_with_outliers | accuracy | 0.716590 | 0.000021 |
| 0 | Decision_Tree_box_cox_with_outliers | accuracy | 0.711802 | 0.00269 |
| 0 | Logistic_Regression_box_cox_with_outliers | accuracy | 0.699078 | 0.001829 |
| 0 | Gaussian_NB_box_cox_with_outliers | recall | 0.696732 | 0.008663 |
| 0 | Neural_Network_box_cox_with_outliers | accuracy | 0.666667 | 0.018147 |
| 0 | QDA_box_cox_with_outliers | recall | 0.608824 | 0.03428 |
| 0 | SGD_box_cox_with_outliers | recall | 0.566667 | 0.065309 |
| 0 | Gaussian_NB_box_cox_with_outliers | accuracy | 0.561777 | 0.003879 |
| 0 | Ada_Boost_box_cox_with_outliers | precision | 0.547251 | 0.021315 |
| 0 | Decision_Tree_box_cox_with_outliers | precision | 0.540650 | 0.059391 |
| 0 | Random_Forest_box_cox_with_outliers | precision | 0.538889 | 0.090094 |
| 0 | SGD_box_cox_with_outliers | accuracy | 0.524680 | 0.019717 |
| 0 | QDA_box_cox_with_outliers | accuracy | 0.480927 | 0.000824 |
| 0 | Logistic_Regression_box_cox_with_outliers | precision | 0.475577 | 0.068937 |
| 0 | Gaussian_NB_box_cox_with_outliers | f1 | 0.474320 | 0.002451 |
| 0 | Dummy_constant_1 | f1 | 0.440678 | 0 |
| 0 | Neural_Network_box_cox_with_outliers | precision | 0.408558 | 0.038278 |
| 0 | SGD_box_cox_with_outliers | f1 | 0.390586 | 0.008212 |
| 0 | QDA_box_cox_with_outliers | f1 | 0.388818 | 0.00778 |
| 0 | Gaussian_NB_box_cox_with_outliers | precision | 0.362814 | 0.001931 |
| 0 | SGD_box_cox_with_outliers | precision | 0.325194 | 0.005431 |
| 0 | Ada_Boost_box_cox_with_outliers | f1 | 0.304584 | 0.009312 |
| 0 | Neural_Network_box_cox_with_outliers | f1 | 0.289932 | 0.020399 |
| 0 | Decision_Tree_box_cox_with_outliers | f1 | 0.289495 | 0.012696 |
| 0 | Neural_Network_box_cox_with_outliers | recall | 0.288889 | 0.068395 |
| 0 | QDA_box_cox_with_outliers | precision | 0.288568 | 0.00271 |
| 0 | Dummy_constant_1 | precision | 0.282609 | 0 |
| 0 | Dummy_constant_1 | accuracy | 0.282609 | 0 |
| 0 | Random_Forest_box_cox_with_outliers | f1 | 0.248681 | 0.011505 |
| 0 | Ada_Boost_box_cox_with_outliers | recall | 0.219281 | 0.00589 |
| 0 | Logistic_Regression_box_cox_with_outliers | f1 | 0.214305 | 0.011297 |
| 0 | Decision_Tree_box_cox_with_outliers | recall | 0.212092 | 0.008908 |
| 0 | Random_Forest_box_cox_with_outliers | recall | 0.167320 | 0.005487 |
| 0 | Logistic_Regression_box_cox_with_outliers | recall | 0.150980 | 0.006862 |
| 0 | Dummy_most_frequent | recall | 0.000000 | 0 |
| 0 | Dummy_most_frequent | f1 | 0.000000 | 0 |
| 0 | Dummy_most_frequent | precision | 0.000000 | 0 |
| 0 | SVC_rbf_box_cox_with_outliers | recall | 0.000000 | 0.0 |
| 0 | SVC_rbf_box_cox_with_outliers | f1 | 0.000000 | 0.0 |
| 0 | SVC_rbf_box_cox_with_outliers | precision | 0.000000 | 0.0 |
import seaborn as sns
sns.barplot(x='accuracy_score', y='method', data=scores, hue='accuracy_type')
<Axes: xlabel='accuracy_score', ylabel='method'>
Everything here is totally useless I think, because Dummy Classifier is one of the best models.
# Definiowanie granic dla outlierów
outliers_dict = {
'T_CLOTHING_6': 37000,
'R_CLOTHING_SAVINGS': 1.50,
'R_CLOTHING_DEBT': 1.0,
'R_EDUCATION_SAVINGS': 1.7,
'R_EDUCATION_DEBT': 0.3,
'R_ENTERTAINMENT_INCOME': 1.2,
'R_ENTERTAINMENT_SAVINGS': 6,
'R_ENTERTAINMENT_DEBT': 2.0,
'R_FINES_INCOME': 0.02,
'R_FINES_SAVINGS': 0.05,
'R_FINES_DEBT': 0.02,
'R_GAMBLING_INCOME': 0.2,
'R_GAMBLING_SAVINGS': 0.8,
'R_GAMBLING_DEBT': 0.15,
'R_GROCERIES_SAVINGS': 3.5,
'T_HEALTH_12': 40000,
'T_HEALTH_6': 25000,
'R_HEALTH_INCOME': 0.3,
'R_HEALTH_SAVINGS': 0.8,
'R_HOUSING_DEBT': 3,
'R_TAX_DEBT': 0.15
}
for col, threshold in outliers_dict.items():
df_without_outliers_bc_train = df_train2[df_train2[col] <= threshold]
df_without_outliers_bc_train.reset_index(drop=True, inplace=True)
df_without_outliers_bc_train
| INCOME | SAVINGS | DEBT | R_SAVINGS_INCOME | R_DEBT_INCOME | R_DEBT_SAVINGS | T_CLOTHING_12 | T_CLOTHING_6 | R_CLOTHING | R_CLOTHING_INCOME | ... | R_EXPENDITURE_SAVINGS | R_EXPENDITURE_DEBT | CAT_GAMBLING | CAT_DEBT | CAT_CREDIT_CARD | CAT_MORTGAGE | CAT_SAVINGS_ACCOUNT | CAT_DEPENDENTS | CREDIT_SCORE | DEFAULT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.619996 | 0.577458 | 1.439124 | 0.130956 | 1.553292 | 0.832731 | -0.047917 | 0.356240 | 0.956257 | -1.121007 | ... | -0.260731 | -1.127503 | 0 | 1 | 1 | 1 | 1 | 1 | -1.130432 | 0 |
| 1 | 0.817704 | 1.623598 | 1.377102 | 0.985366 | 1.247825 | 0.051701 | 0.205790 | 0.303502 | -0.069508 | -0.893278 | ... | -0.952011 | -1.007798 | 0 | 1 | 1 | 0 | 1 | 1 | -0.680778 | 0 |
| 2 | 0.595037 | -0.020955 | 0.861507 | -0.544491 | 0.728279 | 0.852186 | 0.527802 | 0.433736 | -0.397748 | 0.076292 | ... | 0.413988 | -0.794487 | 0 | 1 | 0 | 0 | 1 | 0 | -0.466063 | 0 |
| 3 | 0.639110 | -0.470106 | 0.824800 | -1.091724 | 0.628010 | 1.269785 | 1.436630 | 1.621757 | 1.021196 | 1.657935 | ... | 0.873646 | -0.932200 | 0 | 1 | 1 | 0 | 1 | 0 | -0.173222 | 1 |
| 4 | 0.773380 | 1.625310 | 0.676118 | 1.027970 | 0.279616 | -0.558965 | 0.521028 | 0.640860 | 0.198068 | -0.220086 | ... | -0.978246 | -0.382531 | 2 | 1 | 0 | 0 | 1 | 1 | -0.695687 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 308 | -0.805311 | 0.073590 | -1.080262 | 1.349440 | -1.176479 | -1.279739 | -0.860047 | -0.308425 | 1.727215 | -0.651681 | ... | -1.154654 | 1.536221 | 1 | 1 | 0 | 0 | 1 | 0 | -0.305739 | 0 |
| 309 | -0.967598 | -1.220451 | 0.160487 | -0.798347 | 1.916246 | 1.615881 | -0.612525 | -0.243081 | 0.582913 | 0.775811 | ... | 1.065847 | -1.153746 | 0 | 1 | 0 | 0 | 1 | 0 | -2.176996 | 0 |
| 310 | 0.042250 | 1.340068 | -0.367899 | 1.544719 | -0.702088 | -1.180194 | -0.326573 | -0.258194 | -0.478309 | -0.908676 | ... | -1.244763 | 0.817954 | 0 | 1 | 0 | 0 | 1 | 0 | 0.684931 | 0 |
| 311 | 0.840432 | 1.250887 | 1.106586 | 0.624554 | 0.841425 | 0.047313 | 0.587031 | 0.645190 | 0.052445 | -0.191677 | ... | -0.775283 | -0.860543 | 2 | 1 | 0 | 1 | 1 | 1 | -0.370573 | 0 |
| 312 | 0.420570 | -0.660180 | 0.396702 | -1.175901 | 0.184151 | 1.095918 | 0.951649 | -0.760120 | -1.909047 | 1.238277 | ... | 1.280243 | -0.382531 | 2 | 1 | 0 | 0 | 1 | 0 | 0.284337 | 0 |
313 rows × 86 columns
X2=df_without_outliers_bc_train.drop(target, axis=1)
y2= df_without_outliers_bc_train[target]
SVC
# SVC model
from sklearn.svm import SVC # another kernels than rbf and poly with 10 degrees are useless
from sklearn.metrics import accuracy_score
model = SVC(random_state=42, kernel='poly', degree=10)
model.fit(X2, y2)
y_pred = model.predict(df_val2.drop(target, axis=1))
raport=classification_report(df_val2[target], y_pred)
print(raport)
precision recall f1-score support
0 0.72 1.00 0.84 99
1 0.00 0.00 0.00 39
accuracy 0.72 138
macro avg 0.36 0.50 0.42 138
weighted avg 0.51 0.72 0.60 138
Random Forest model
# Random Forest model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X2, y2)
y_pred = model.predict(df_val2.drop(target, axis=1))
raport=classification_report(df_val2[target], y_pred)
print(raport)
precision recall f1-score support
0 0.72 0.23 0.35 99
1 0.28 0.77 0.41 39
accuracy 0.38 138
macro avg 0.50 0.50 0.38 138
weighted avg 0.60 0.38 0.37 138
X=df_train.drop(target, axis=1)
X_val=df_val.drop(target, axis=1)
X_val['CAT_GAMBLING'] = X_val['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
# map CAT_GAMBLING (No-0, Low-1, High-2)
X['CAT_GAMBLING'] = X['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
y=df_train[target]
SVC
# SVC model
from sklearn.svm import SVC # another kernels than rbf and poly with 10 degrees are useless
from sklearn.metrics import accuracy_score
model = SVC(random_state=42, kernel='poly', degree=10)
model.fit(X, y)
y_pred = model.predict(df_val2.drop(target, axis=1))
raport=classification_report(df_val2[target], y_pred)
print(raport)
precision recall f1-score support
0 0.72 1.00 0.84 99
1 0.00 0.00 0.00 39
accuracy 0.72 138
macro avg 0.36 0.50 0.42 138
weighted avg 0.51 0.72 0.60 138
Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
y_pred = model.predict(X_val)
raport=classification_report(df_val[target], y_pred)
print(raport)
precision recall f1-score support
0 0.75 0.90 0.82 99
1 0.47 0.23 0.31 39
accuracy 0.71 138
macro avg 0.61 0.56 0.56 138
weighted avg 0.67 0.71 0.67 138
# Definiowanie granic dla outlierów
outliers_dict = {
'T_CLOTHING_6': 37000,
'R_CLOTHING_SAVINGS': 1.50,
'R_CLOTHING_DEBT': 1.0,
'R_EDUCATION_SAVINGS': 1.7,
'R_EDUCATION_DEBT': 0.3,
'R_ENTERTAINMENT_INCOME': 1.2,
'R_ENTERTAINMENT_SAVINGS': 6,
'R_ENTERTAINMENT_DEBT': 2.0,
'R_FINES_INCOME': 0.02,
'R_FINES_SAVINGS': 0.05,
'R_FINES_DEBT': 0.02,
'R_GAMBLING_INCOME': 0.2,
'R_GAMBLING_SAVINGS': 0.8,
'R_GAMBLING_DEBT': 0.15,
'R_GROCERIES_SAVINGS': 3.5,
'T_HEALTH_12': 40000,
'T_HEALTH_6': 25000,
'R_HEALTH_INCOME': 0.3,
'R_HEALTH_SAVINGS': 0.8,
'R_HOUSING_DEBT': 3,
'R_TAX_DEBT': 0.15
}
for col, threshold in outliers_dict.items():
df_without_outliers_train = df_train[df_train[col] <= threshold]
df_without_outliers_train.reset_index(drop=True, inplace=True)
df_without_outliers_train
| INCOME | SAVINGS | DEBT | R_SAVINGS_INCOME | R_DEBT_INCOME | R_DEBT_SAVINGS | T_CLOTHING_12 | T_CLOTHING_6 | R_CLOTHING | R_CLOTHING_INCOME | ... | R_EXPENDITURE_SAVINGS | R_EXPENDITURE_DEBT | CAT_GAMBLING | CAT_DEBT | CAT_CREDIT_CARD | CAT_MORTGAGE | CAT_SAVINGS_ACCOUNT | CAT_DEPENDENTS | CREDIT_SCORE | DEFAULT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2783 | 1855 | 0 | 0.6665 | 0.0000 | 0.0000 | 103 | 74 | 0.7184 | 0.0370 | ... | 2.5003 | 0.0000 | No | 0 | 0 | 0 | 1 | 0 | 570 | 0 |
| 1 | 314430 | 445442 | 707468 | 1.4167 | 2.2500 | 1.5882 | 35861 | 29157 | 0.8131 | 0.1141 | ... | 0.5882 | 0.3704 | High | 1 | 0 | 1 | 1 | 0 | 691 | 0 |
| 2 | 161773 | 517674 | 2782496 | 3.2000 | 17.2000 | 5.3750 | 3716 | 2533 | 0.6816 | 0.0230 | ... | 0.3125 | 0.0581 | No | 1 | 1 | 1 | 1 | 1 | 520 | 0 |
| 3 | 16014 | 97685 | 20818 | 6.1000 | 1.3000 | 0.2131 | 637 | 187 | 0.2936 | 0.0398 | ... | 0.1639 | 0.7692 | No | 1 | 0 | 0 | 1 | 0 | 654 | 0 |
| 4 | 193225 | 1410542 | 2589215 | 7.3000 | 13.4000 | 1.8356 | 5276 | 2325 | 0.4407 | 0.0273 | ... | 0.1370 | 0.0746 | No | 1 | 1 | 0 | 1 | 1 | 552 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 484 | 20007 | 22508 | 457665 | 1.1250 | 22.8752 | 20.3334 | 1451 | 861 | 0.5934 | 0.0725 | ... | 1.1111 | 0.0546 | No | 1 | 0 | 0 | 1 | 0 | 423 | 0 |
| 485 | 20846 | 268679 | 23162 | 12.8888 | 1.1111 | 0.0862 | 1302 | 533 | 0.4094 | 0.0625 | ... | 0.0862 | 1.0000 | Low | 1 | 0 | 0 | 1 | 0 | 584 | 0 |
| 486 | 89235 | 1106514 | 160623 | 12.4000 | 1.8000 | 0.1452 | 2413 | 835 | 0.3460 | 0.0270 | ... | 0.0806 | 0.5556 | No | 1 | 0 | 0 | 1 | 0 | 631 | 0 |
| 487 | 197073 | 1021195 | 1863233 | 5.1818 | 9.4545 | 1.8246 | 8416 | 3948 | 0.4691 | 0.0427 | ... | 0.1754 | 0.0962 | High | 1 | 0 | 1 | 1 | 1 | 572 | 0 |
| 488 | 133592 | 85013 | 680103 | 0.6364 | 5.0909 | 8.0000 | 12476 | 264 | 0.0212 | 0.0934 | ... | 1.4286 | 0.1786 | High | 1 | 0 | 0 | 1 | 0 | 610 | 0 |
489 rows × 86 columns
X3=df_without_outliers_train.drop(target, axis=1)
y3= df_without_outliers_train[target]
X3['CAT_GAMBLING'] = X3['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
SVC
# SVC model
from sklearn.svm import SVC # another kernels than rbf and poly with 10 degrees are useless
from sklearn.metrics import accuracy_score
model = SVC(random_state=42, kernel='poly', degree=10)
model.fit(X3, y3)
y_pred = model.predict(df_val2.drop(target, axis=1))
raport=classification_report(df_val2[target], y_pred)
print(raport)
precision recall f1-score support
0 0.72 1.00 0.84 99
1 0.00 0.00 0.00 39
accuracy 0.72 138
macro avg 0.36 0.50 0.42 138
weighted avg 0.51 0.72 0.60 138
Random Forest
model = RandomForestClassifier(random_state=42)
model.fit(X3, y3)
y_pred = model.predict(X_val)
raport=classification_report(df_val[target], y_pred)
print(raport)
precision recall f1-score support
0 0.74 0.90 0.81 99
1 0.41 0.18 0.25 39
accuracy 0.70 138
macro avg 0.57 0.54 0.53 138
weighted avg 0.64 0.70 0.65 138
For SVC data processing and transformation doesn't change accuracy_score, with RandomForestClassifier the accuracy_score is the highest when withour transormation.
# columns to remove from EDA
columns_to_remove = ["T_CLOTHING_12","T_ENTERTAINMENT_12", "T_GROCERIES_12", "T_GROCERIES_6", "T_HEALTH_12", "T_TAX_12", "T_TAX_6", "T_TRAVEL_12", "T_TRAVEL_6","T_UTILITIES_12", "T_UTILITIES_6", "T_EXPENDITURE_12", "T_EXPENDITURE_6"]
columns_to_remove
['T_CLOTHING_12', 'T_ENTERTAINMENT_12', 'T_GROCERIES_12', 'T_GROCERIES_6', 'T_HEALTH_12', 'T_TAX_12', 'T_TAX_6', 'T_TRAVEL_12', 'T_TRAVEL_6', 'T_UTILITIES_12', 'T_UTILITIES_6', 'T_EXPENDITURE_12', 'T_EXPENDITURE_6']
df_train=pd.read_csv('../data/for_modelling/credit_score_train.csv')
df_val=pd.read_csv('../data/for_modelling/credit_score_valid.csv')
# map CAT_GAMBLING (No-0, Low-1, High-2)
df_train['CAT_GAMBLING'] = df_train['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
df_val['CAT_GAMBLING'] = df_val['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
# remove columns with high correlation
df_train.drop(columns=columns_to_remove, inplace=True)
df_val.drop(columns=columns_to_remove, inplace=True)
# remove outliers
from pyod.models.knn import KNN
clf = KNN(contamination=0.04)
rest_cols = [col for col in df_train.columns if col not in columns_to_remove]
# remove DEFAULT column
rest_cols.remove('DEFAULT')
clf.fit(df_train[rest_cols])
df_train['outliers'] = clf.labels_
# sum of outliers
df_train['outliers'].value_counts()
outliers 0 470 1 20 Name: count, dtype: int64
df_train = df_train[df_train['outliers'] == 0]
df_train.drop(columns='outliers', inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_train
| INCOME | SAVINGS | DEBT | R_SAVINGS_INCOME | R_DEBT_INCOME | R_DEBT_SAVINGS | T_CLOTHING_6 | R_CLOTHING | R_CLOTHING_INCOME | R_CLOTHING_SAVINGS | ... | R_EXPENDITURE_SAVINGS | R_EXPENDITURE_DEBT | CAT_GAMBLING | CAT_DEBT | CAT_CREDIT_CARD | CAT_MORTGAGE | CAT_SAVINGS_ACCOUNT | CAT_DEPENDENTS | CREDIT_SCORE | DEFAULT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2783 | 1855 | 0 | 0.6665 | 0.0000 | 0.0000 | 74 | 0.7184 | 0.0370 | 0.0555 | ... | 2.5003 | 0.0000 | 0 | 0 | 0 | 0 | 1 | 0 | 570 | 0 |
| 1 | 314430 | 445442 | 707468 | 1.4167 | 2.2500 | 1.5882 | 29157 | 0.8131 | 0.1141 | 0.0805 | ... | 0.5882 | 0.3704 | 2 | 1 | 0 | 1 | 1 | 0 | 691 | 0 |
| 2 | 161773 | 517674 | 2782496 | 3.2000 | 17.2000 | 5.3750 | 2533 | 0.6816 | 0.0230 | 0.0072 | ... | 0.3125 | 0.0581 | 0 | 1 | 1 | 1 | 1 | 1 | 520 | 0 |
| 3 | 16014 | 97685 | 20818 | 6.1000 | 1.3000 | 0.2131 | 187 | 0.2936 | 0.0398 | 0.0065 | ... | 0.1639 | 0.7692 | 0 | 1 | 0 | 0 | 1 | 0 | 654 | 0 |
| 4 | 193225 | 1410542 | 2589215 | 7.3000 | 13.4000 | 1.8356 | 2325 | 0.4407 | 0.0273 | 0.0037 | ... | 0.1370 | 0.0746 | 0 | 1 | 1 | 0 | 1 | 1 | 552 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 465 | 20007 | 22508 | 457665 | 1.1250 | 22.8752 | 20.3334 | 861 | 0.5934 | 0.0725 | 0.0645 | ... | 1.1111 | 0.0546 | 0 | 1 | 0 | 0 | 1 | 0 | 423 | 0 |
| 466 | 20846 | 268679 | 23162 | 12.8888 | 1.1111 | 0.0862 | 533 | 0.4094 | 0.0625 | 0.0048 | ... | 0.0862 | 1.0000 | 1 | 1 | 0 | 0 | 1 | 0 | 584 | 0 |
| 467 | 89235 | 1106514 | 160623 | 12.4000 | 1.8000 | 0.1452 | 835 | 0.3460 | 0.0270 | 0.0022 | ... | 0.0806 | 0.5556 | 0 | 1 | 0 | 0 | 1 | 0 | 631 | 0 |
| 468 | 197073 | 1021195 | 1863233 | 5.1818 | 9.4545 | 1.8246 | 3948 | 0.4691 | 0.0427 | 0.0082 | ... | 0.1754 | 0.0962 | 2 | 1 | 0 | 1 | 1 | 1 | 572 | 0 |
| 469 | 133592 | 85013 | 680103 | 0.6364 | 5.0909 | 8.0000 | 264 | 0.0212 | 0.0934 | 0.1468 | ... | 1.4286 | 0.1786 | 2 | 1 | 0 | 0 | 1 | 0 | 610 | 0 |
470 rows × 73 columns
X=df_train.drop(target, axis=1)
y=df_train[target]
X_val=df_val.drop(target, axis=1)
y_val=df_val[target]
# Box Cox and standart scalling
for col in rest_cols:
X[col] = boxcox(X[col]+1)[0]
X_val[col] = boxcox(X_val[col]+1)[0]
X[rest_cols] = scaler.fit_transform(X[rest_cols])
X_val[rest_cols] = scaler.transform(X_val[rest_cols])
Let's check the models
score=pd.DataFrame(columns=['method', 'accuracy_type', 'accuracy_score', 'variance'])
names=[
'SVC',
'Random_Forest',
'Decision_Tree',
'Ada_Boost',
'Gaussian_NB',
'Neural_Network',
'Logistic_Regression',
'QDA',
'SGD',
'Dummy_most_frequent',
]
classifiers=[
SVC(random_state=42, kernel='poly', degree=10),
RandomForestClassifier(random_state=42),
DecisionTreeClassifier(random_state=42, max_depth=5),
AdaBoostClassifier(random_state=42, algorithm='SAMME'),
GaussianNB(),
MLPClassifier(random_state=42, max_iter=1000),
LogisticRegression(random_state=42),
QuadraticDiscriminantAnalysis(),
SGDClassifier(random_state=42),
DummyClassifier(strategy="most_frequent")
]
for name, classifier in zip(names, classifiers):
classifier.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
scores=cross_val_score(classifier, pd.concat([X,X_val]), pd.concat([y, y_val]), cv=10, scoring=type)
df_row=pd.DataFrame({'method':name, 'accuracy_type':type, 'accuracy_score':scores.mean(), 'variance':scores.var()}, index=[0])
score=pd.concat([score, df_row])
score.sort_values(by='accuracy_score', ascending=False)
| method | accuracy_type | accuracy_score | variance | |
|---|---|---|---|---|
| 0 | Dummy_most_frequent | accuracy | 0.717104 | 0.000040 |
| 0 | Neural_Network | accuracy | 0.713825 | 0.000755 |
| 0 | Random_Forest | accuracy | 0.713798 | 0.001097 |
| 0 | Logistic_Regression | accuracy | 0.708743 | 0.001495 |
| 0 | Ada_Boost | accuracy | 0.707186 | 0.000773 |
| 0 | SVC | accuracy | 0.705410 | 0.001291 |
| 0 | Decision_Tree | accuracy | 0.697240 | 0.001437 |
| 0 | Gaussian_NB | recall | 0.678758 | 0.008176 |
| 0 | Gaussian_NB | accuracy | 0.573689 | 0.003685 |
| 0 | QDA | recall | 0.556863 | 0.050690 |
| 0 | QDA | accuracy | 0.537869 | 0.003529 |
| 0 | SGD | recall | 0.530719 | 0.064501 |
| 0 | Random_Forest | precision | 0.514762 | 0.056325 |
| 0 | SGD | accuracy | 0.494372 | 0.012958 |
| 0 | Ada_Boost | precision | 0.484300 | 0.022240 |
| 0 | Gaussian_NB | f1 | 0.474327 | 0.001995 |
| 0 | Logistic_Regression | precision | 0.458651 | 0.022632 |
| 0 | Decision_Tree | precision | 0.455476 | 0.021991 |
| 0 | Neural_Network | precision | 0.392858 | 0.055516 |
| 0 | QDA | f1 | 0.387052 | 0.016288 |
| 0 | Gaussian_NB | precision | 0.368165 | 0.001808 |
| 0 | SGD | f1 | 0.358272 | 0.006583 |
| 0 | Logistic_Regression | f1 | 0.312510 | 0.016061 |
| 0 | QDA | precision | 0.303459 | 0.006640 |
| 0 | SGD | precision | 0.287123 | 0.003473 |
| 0 | Decision_Tree | f1 | 0.277330 | 0.005268 |
| 0 | Neural_Network | f1 | 0.276050 | 0.029940 |
| 0 | Logistic_Regression | recall | 0.243137 | 0.012859 |
| 0 | Ada_Boost | f1 | 0.237586 | 0.006305 |
| 0 | Random_Forest | f1 | 0.229153 | 0.007249 |
| 0 | Neural_Network | recall | 0.225163 | 0.024914 |
| 0 | Decision_Tree | recall | 0.208497 | 0.004412 |
| 0 | SVC | precision | 0.180000 | 0.100489 |
| 0 | Ada_Boost | recall | 0.168627 | 0.005840 |
| 0 | Random_Forest | recall | 0.150980 | 0.003522 |
| 0 | SVC | f1 | 0.091118 | 0.025519 |
| 0 | SVC | recall | 0.087908 | 0.026491 |
| 0 | Dummy_most_frequent | precision | 0.000000 | 0.000000 |
| 0 | Dummy_most_frequent | recall | 0.000000 | 0.000000 |
| 0 | Dummy_most_frequent | f1 | 0.000000 | 0.000000 |
import seaborn as sns
sns.barplot(x='accuracy_score', y='method', data=score, hue='accuracy_type')
<Axes: xlabel='accuracy_score', ylabel='method'>